library(tidyverse) # Colección de paquetes para ciencia de datos (incluye ggplot2, dplyr, tidyr, readr, purrr, tibble)
library(reshape) # Herramientas para reorganizar datos
library(Hmisc) # resúmenes estadísticos
library(limma) # Análisis de datos de expresión genética
library(AnnotationDbi) # Interfaz para bases de datos de anotaciones bioinformáticas
library(org.Hs.eg.db) # Datos de anotación para genes humanos
library(VennDiagram) # Generación de diagramas de Venn 
library(gridExtra) # Mostrar varias gráficas
library(patchwork) # Combinar múltiples ggplots en un único plot
library(ggrepel) # Mejora la visualización de texto en ggplots evitando solapamientos de texto
library(Rtsne) # Implementación de t-SNE
library(umap) # Implementación de UMAP
library(ggVennDiagram) # diagrama de Venn

1 Leyendo datos

Descargar los datos. Escalo los datos de la matriz para tener homogeneidad en las representaciones.

# covariables
ROSMAP_RINPMIAGESEX_covs <- readRDS("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/ROSMAP_RINPMIAGESEX_covs.rds")
covs <- ROSMAP_RINPMIAGESEX_covs
rownames(covs) <- covs$mrna_id
covs$study <- as.factor(covs$study)
covs$projid <- as.character(covs$projid)
covs$ceradsc <- as.factor(covs$ceradsc)
covs$cogdx <- as.factor(covs$cogdx)
covs$neuroStatus <- as.factor(covs$neuroStatus)

#datos corregidos 
ROSMAP_RINPMIAGESEX_resids <- readRDS("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/ROSMAP_RINPMIAGESEX_resids.rds")
mat_exp <- scale(ROSMAP_RINPMIAGESEX_resids)

2 Geneset de alzheimer KEGG

Descarga del geneset de KEGG

# geneset de Alzheimer extraidos de KEGG

tab <- getGeneKEGGLinks(species="hsa")
tab$Symbol <- mapIds(org.Hs.eg.db, tab$GeneID,
                       column="SYMBOL", keytype="ENTREZID")

paths <- getKEGGPathwayNames(species="hsa")
geneset_alz <- tab$Symbol[tab$PathwayID=="hsa05010"]

Vamos a seleccionar solamente los genes de Alzheimer de KEGG en nuestro dataset de expresión génica.

mat_exp_alz_genes <- mat_exp[, colnames(mat_exp) %in% geneset_alz]

Visualizamos si se han seleccionado todos los genes y cuantos del geneset de KEGG. Nuestra matriz tiene 305 genes de un total 384 del geneset.

venn.plot <- venn.diagram(
  x = list(GenesMatriz = colnames(mat_exp), GenesetAlzheimer = geneset_alz),
  category.names = c("Matrix Genes", "Geneset Alzheimer"),
  filename = NULL,
  output = FALSE,  # Asegura que no se exporte a un archivo
  fill = c("#440154ff", '#fde725ff'),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = 25, #posicion de las categoricas
  cat.dist = 0.1, #distancia de las categoricas
  rotation.degree = 0, 
  margin = 0.1, # hacerla más pequeña
  lwd=0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = "Alzheimer genes in exp matrix",
  main.fontface= "bold", 
  main.cex = 2,
  main.pos = c(0.5, 1)
)

grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)

2.1 PCA

pca_alz_genes_scaled <- prcomp(mat_exp_alz_genes)
# Scree plot con los datos escalados
var_exp <- pca_alz_genes_scaled$sdev^2
prop_var_exp <- var_exp / sum(var_exp)
cum_var_exp <- cumsum(prop_var_exp)

df_var_exp <- data.frame(Comp = 1:length(prop_var_exp), VarExp = prop_var_exp)
df_cum_var_exp <- data.frame(Comp = 1:length(cum_var_exp), CumVarExp = cum_var_exp)

ggplot(df_var_exp[1:20,], aes(x = Comp, y = VarExp)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  geom_line(aes(group = 1), color = "blue") +
  geom_point(color = "blue") +
  theme_minimal() +
  labs(x = "Principal components", y = "Variance", title = "Scree Plot scaled") +
  ylim(c(0,1)) +
  geom_line(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), color="#8B1A1A") +
  geom_point(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), color = "red") +
  geom_bar(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), stat = "identity", fill = "red", alpha= 0.25) +
  annotate("text", x = 4, y = 0.85, label = "Cumulative Scree Plot", color = "#8B1A1A", size = 4) +
  geom_text(data = df_cum_var_exp[seq(0,20,2),], aes(x=Comp, y = CumVarExp +0.04, label = round(CumVarExp, 2)))

Vemos que con las 20 primeras no encontramos ni siquiera el 80% de la varianza. Las dos primeras PC tienen un 41% de la varizana explicada.

2.1.1 Seleccionar muestras con el doble de la desviación típica

Selecciono los outliers de las dos primeras PC con el doble de SD.

outlierspc1_scaled <- as.data.frame(pca_alz_genes_scaled$x[abs(pca_alz_genes_scaled$x[,1]) > 2*(pca_alz_genes_scaled$sdev[1]),1])

outlierspc2_scaled <- as.data.frame(pca_alz_genes_scaled$x[abs(pca_alz_genes_scaled$x[,2]) > 2*(pca_alz_genes_scaled$sdev[2]),2])
df <- as.data.frame(pca_alz_genes_scaled$x)

plot1 <- ggplot(df, aes(x = PC1)) +
  geom_density(fill = "#00CED1", alpha = 0.5) +
  geom_vline(xintercept = mean(df$PC1) + 2*pca_alz_genes_scaled$sdev[1], linetype = "dashed", color = "blue") +
  geom_vline(xintercept = mean(df$PC1) - 2*pca_alz_genes_scaled$sdev[1], linetype = "dashed", color = "blue") +
  labs(title = "PC1 density with 2*SD scaled PCA") +
  geom_text(data = outlierspc1_scaled, 
            aes(x = outlierspc1_scaled[,1], y= 0, label = rownames(outlierspc1_scaled)), 
            vjust = 1.5, hjust = 0, size = 3, color = "#E9965A", angle = 90, fontface= "bold") +
  geom_rug(data = as.data.frame(pca_alz_genes_scaled$x), aes(x= PC1, y = 0), color= ifelse(abs(pca_alz_genes_scaled$x[,1]) > 2*(pca_alz_genes_scaled$sdev[1]), "#8B1A1A", "grey")) +
  theme_minimal()

plot2 <- ggplot(df, aes(x = PC2)) +
  geom_density(fill = "#00CED1", alpha = 0.5) +
  geom_vline(xintercept = mean(df$PC2) + 2*pca_alz_genes_scaled$sdev[2], linetype = "dashed", color = "blue") +
  geom_vline(xintercept = mean(df$PC2) - 2*pca_alz_genes_scaled$sdev[2], linetype = "dashed", color = "blue") +
  labs(title = "PC2 density with 2*SD scaled PCA") +
  geom_text(data = outlierspc2_scaled, 
            aes(x = outlierspc2_scaled[,1], y= 0, label = rownames(outlierspc2_scaled)), 
            vjust = 1.5, hjust = 0, size = 3, color = "#E9965A", angle = 90, fontface= "bold") +
  geom_rug(data = as.data.frame(pca_alz_genes_scaled$x), aes(x= PC2, y = 0), color= ifelse(abs(pca_alz_genes_scaled$x[,2]) > 2*(pca_alz_genes_scaled$sdev[2]), "#8B1A1A", "grey")) +
  theme_minimal()

grid.arrange(plot1, plot2, ncol = 1)

mPC1_scaled <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc1_scaled),]
mPC2_scaled <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc2_scaled),]
mPC12_scaled <- mat_exp[(rownames(mat_exp) %in% rownames(outlierspc2_scaled)) & (rownames(mat_exp) %in% rownames(outlierspc1_scaled)),]

not_in_PC12_scaled <- mat_exp[!((rownames(mat_exp) %in% rownames(outlierspc2_scaled)) | (rownames(mat_exp) %in% rownames(outlierspc1_scaled))),]
# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mPC1_scaled)), length(rownames(mPC2_scaled)), length(rownames(mPC12_scaled)))

# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_mPC1_scaled <- c(rownames(mPC1_scaled), rep("", max_length - length(rownames(mPC1_scaled))))
rownames_mPC2_scaled <- c(rownames(mPC2_scaled), rep("", max_length - length(rownames(mPC2_scaled))))
rownames_mPC12_scaled <- c(rownames(mPC12_scaled), rep("", max_length - length(rownames(mPC12_scaled))))

final_table <- data.frame(
  mPC1 = rownames_mPC1_scaled,
  mPC2 = rownames_mPC2_scaled,
  mPC1_and_PC2 = rownames_mPC12_scaled
)

final_table

Voy ahora a ver de esas muestras seleccionadas, qué covariables están asociadas con esas muestras outliers

2.1.2 Covariables en outliers

rownames(covs) <- covs$mrna_id
covs2 <- covs

for (i in rownames(covs2)){
  if (i %in% rownames(mPC1_scaled) & !(i %in% rownames(mPC12_scaled))){
    covs2[i, "sampleset_PCA"] <- "mPC1"
  }
  if (i %in% rownames(mPC2_scaled) & !(i %in% rownames(mPC12_scaled))){
    covs2[i, "sampleset_PCA"] <- "mPC2"
  }
  if (i %in% rownames(mPC12_scaled)){
    covs2[i, "sampleset_PCA"] <- "mPC1 and mPC2"
  }
  if (!(i %in% rownames(mPC1_scaled)) & !(i %in% rownames(mPC2_scaled))){
    covs2[i, "sampleset_PCA"] <- "Not in both"
  }
}

covs2$sampleset_PCA <- as.factor(covs2$sampleset_PCA)
data.frame(table(covs2$sampleset_PCA))
pca.data.kegg <- data.frame(pca_alz_genes_scaled$x)[,1:2]
colnames(pca.data.kegg) <- c("PC1.KEGG", "PC2.KEGG")

covs2 <- merge(covs2, pca.data.kegg, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULL
plots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
  col_name <- colnames(covs2)[i]
  
  if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric" ){
    next
  }
  if (class(covs2[[i]]) == "factor"){
    filt <- covs2 %>%
      group_by(across(all_of(col_name)), sampleset_PCA) %>%
      summarise(count = n(), .groups = 'drop') %>%
      filter(sampleset_PCA == "mPC1" | sampleset_PCA == "mPC2" | sampleset_PCA == "mPC1 and mPC2") %>%
      mutate(total = sum(count), percentage = (count / total) * 100) %>%
      group_by(across(all_of(col_name))) %>% 
      mutate(suma.total = sum(percentage)) %>%
      ungroup()
    
    p <- ggplot(filt, aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_PCA")) +
        geom_bar(stat="identity", position=position_dodge()) +
        geom_text(aes(label=paste0(round(suma.total, 1), " %"), y = suma.total), label.size = 10, color = "black") +
        labs(x = names(covs2)[i], y = "Percentage", fill = "Outliers") +
        theme_minimal() 
    
    if (length(unique(covs2[[i]])) >= 2) {
      p <- p + 
        geom_segment(aes(x = 1, xend = 1, y = 0, yend = suma.total[1]-1), color = "red", size = 3) 
        p <- p + geom_segment(aes(x = 2, xend = 2, y = 0, yend = suma.total[3]-1), color = "red", size = 3)
      if (length(unique(covs2[[i]])) >= 3){
        p <- p + geom_segment(aes(x = 3, xend = 3, y = 0, yend = suma.total[5]-1), color = "red", size = 3)
        if (length(unique(covs2[[i]])) >= 4) {
          p <- p + geom_segment(aes(x = 4, xend = 4, y = 0, yend = suma.total[7]-1), color = "red", size = 3)
          if (length(unique(covs2[[i]])) >= 5) {
            p <- p + geom_segment(aes(x = 5, xend = 5, y = 0, yend = suma.total[9]-1), color = "red", size = 3)
          }
        }
      }
    } 
  }
    
    plots[[colnames(covs2)[i]]] <- p
  
}

grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype,  plots$neuroStatus, ncol=2)

plots.pca.kegg <- list()

# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
  # Crea una gráfica para cada variable
  p <- ggplot(covs2, aes_string(x = "PC1.KEGG", y = "PC2.KEGG", color = names(covs2)[i])) +
    geom_point(show.legend = TRUE) +
    geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outlierspc1_scaled) | mrna_id %in% rownames(outlierspc2_scaled), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 10, # Reduce el número máximo de solapamientos
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = names(covs2)[i],
         x = "PC1.KEGG", y = "PC2.KEGG", color = names(covs2)[i]) +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm"),
      plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
    )

  plots.pca.kegg[[colnames(covs2)[i]]] <- p
}

grid.arrange(plots.pca.kegg$study, plots.pca.kegg$braaksc, ncol=1)

grid.arrange(plots.pca.kegg$ceradsc, plots.pca.kegg$cogdx, ncol=1)

grid.arrange(plots.pca.kegg$apoe_genotype, plots.pca.kegg$neuroStatus, ncol=1)

2.2 tSNE

set.seed(1234)
tsne <- Rtsne(mat_exp_alz_genes, dims = 2, theta = 0.0)

tsne.data <- as.data.frame(tsne$Y)
row.names(tsne.data) <- row.names(mat_exp_alz_genes)
tsne.data.covs <- merge(tsne.data, covs, by = "row.names")
tsne.data.covs$Row.names <- NULL
row.names(tsne.data.covs) <- tsne.data.covs$mrna_id

2.2.1 Seleccionamos outliers

rownames(tsne.data) <- rownames(mat_exp_alz_genes)

# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.tsne1 <- mean(tsne.data[,1])
sd.tsne1 <- sd(tsne.data[,1])
mean.tsne2 <- mean(tsne.data[,2])
sd.tsne2 <- sd(tsne.data[,2])

# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.tsne1 <- tsne.data[abs(tsne.data[,1] - mean.tsne1) > 2 * sd.tsne1, ]
outliers.tsne2 <-tsne.data[abs(tsne.data[,2] - mean.tsne2) > 2 * sd.tsne2, ]

mtSNE1 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne1),]
mtSNE2 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne2),]
mtSNE12 <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.tsne1)) & (rownames(mat_exp) %in% rownames(outliers.tsne2)),]

not_in_tSNE12 <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.tsne1)) | (rownames(mat_exp) %in% rownames(outliers.tsne2))),]

2.2.2 Covariables en outliers

for (i in rownames(covs2)){
  if (i %in% rownames(mtSNE1) & !(i %in% rownames(mtSNE12))){
    covs2[i, "sampleset_tSNE"] <- "mtSNE1"
  }
  if (i %in% rownames(mtSNE2) & !(i %in% rownames(mtSNE12))){
    covs2[i, "sampleset_tSNE"] <- "mtSNE2"
  }
  if (i %in% rownames(mtSNE12)){
    covs2[i, "sampleset_tSNE"] <- "mtSNE1 and mtSNE2"
  }
  if (!(i %in% rownames(mtSNE1)) & !(i %in% rownames(mtSNE2))){
    covs2[i, "sampleset_tSNE"] <- "Not in both"
  }
}

covs2$sampleset_tSNE <- as.factor(covs2$sampleset_tSNE)
data.frame(table(covs2$sampleset_tSNE))
# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mtSNE1)), length(rownames(mtSNE2)), length(rownames(mtSNE12)))

# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_tsne1 <- c(rownames(mtSNE1), rep("", max_length - length(rownames(mtSNE1))))
rownames_tsne2 <- c(rownames(mtSNE2), rep("", max_length - length(rownames(mtSNE2))))
rownames_tsne12 <- c(rownames(mtSNE12), rep("", max_length - length(rownames(mtSNE12))))

final_table <- data.frame(
  mtSNE1 = rownames_tsne1,
  mtSNE2 = rownames_tsne2,
  mtSNE1_and_mtSNE2 = rownames_tsne12
)

final_table
colnames(tsne.data) <- c("tSNE1.KEGG", "tSNE2.KEGG")

covs2 <- merge(covs2, tsne.data, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULL
plots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
  col_name <- colnames(covs2)[i]
  
  if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric" ){
    next
  }
  if (class(covs2[[i]]) == "factor"){
    p <- covs2 %>%
      group_by(across(all_of(col_name)), sampleset_tSNE) %>%
      summarise(count = n(), .groups = 'drop') %>%
      filter(sampleset_tSNE == "mtSNE1" | sampleset_tSNE == "mtSNE2" | sampleset_tSNE == "mtSNE1 and mtSNE2") %>%
      mutate(total = sum(count), percentage = (count / total) * 100) %>%
      ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_tSNE")) +
      geom_bar(stat="identity", position=position_dodge()) +
      labs(x = names(covs2)[i], y = "Percentage", fill = "Outliers") +
      theme_minimal()
    
    plots[[colnames(covs2)[i]]] <- p
  }
}

grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype,  plots$neuroStatus, ncol=2)

plots.tsne.kegg <- list()

# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
  # Crea una gráfica para cada variable
  p <- ggplot(covs2, aes_string(x = "tSNE1.KEGG", y = "tSNE2.KEGG", color = names(covs2)[i])) +
    geom_point(show.legend = TRUE) +
    geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outliers.tsne1) | mrna_id %in% rownames(outliers.tsne2), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 30, # Reduce el número máximo de solapamientos
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = names(covs2)[i],
         x = "tSNE1.KEGG", y = "tSNE2.KEGG", color = names(covs2)[i]) +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm"),
      plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
    )

  plots.tsne.kegg[[colnames(covs2)[i]]] <- p
}

grid.arrange(plots.tsne.kegg$study, plots.tsne.kegg$braaksc, ncol=1)

grid.arrange(plots.tsne.kegg$ceradsc, plots.tsne.kegg$cogdx, ncol=1)

grid.arrange(plots.tsne.kegg$apoe_genotype, plots.tsne.kegg$neuroStatus, ncol=1)

2.3 UMAP

local.config <- umap.defaults
# local.config$n_neighbors <- 4
# local.config$n_components <- 2
# local.config$n_epochs <- 100
# local.config$metric<- "euclidean"
set.seed(1234)
umap.ad <- umap(mat_exp_alz_genes,random_stage=1234, local.config)

umap.data <- as.data.frame(umap.ad$layout)
row.names(umap.data) <- row.names(mat_exp_alz_genes)
umap.data.covs <- merge(umap.data, covs2, by = "row.names")
umap.data.covs$Row.names <- NULL
row.names(umap.data.covs) <- umap.data.covs$mrna_id

2.3.1 Seleccionamos outliers

rownames(umap.data) <- rownames(mat_exp_alz_genes)

# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.umap1 <- mean(umap.data[,1])
sd.umap1 <- sd(umap.data[,1])
mean.umap2 <- mean(umap.data[,2])
sd.umap2 <- sd(umap.data[,2])

# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.umap1 <- umap.data[abs(umap.data[,1] - mean.umap1) > 2 * sd.umap1, ]
outliers.umap2 <-umap.data[abs(umap.data[,2] - mean.umap2) > 2 * sd.umap2, ]

mUMAP1 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap1),]
mUMAP2 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap2),]
mUMAP12 <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.umap1)) & (rownames(mat_exp) %in% rownames(outliers.umap2)),]

not_in_UMAP12 <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.umap1)) | (rownames(mat_exp) %in% rownames(outliers.umap2))),]

2.3.2 Covariables en outliers

for (i in rownames(covs2)){
  if (i %in% rownames(mUMAP1) & !(i %in% rownames(mUMAP12))){
    covs2[i, "sampleset_UMAP"] <- "mUMAP1"
  }
  if (i %in% rownames(mUMAP2) & !(i %in% rownames(mUMAP12))){
    covs2[i, "sampleset_UMAP"] <- "mUMAP2"
  }
  if (i %in% rownames(mUMAP12)){
    covs2[i, "sampleset_UMAP"] <- "mUMAP1 and mUMAP2"
  }
  if (!(i %in% rownames(mUMAP1)) & !(i %in% rownames(mUMAP2))){
    covs2[i, "sampleset_UMAP"] <- "Not in both"
  }
}

covs2$sampleset_UMAP <- as.factor(covs2$sampleset_UMAP)
data.frame(table(covs2$sampleset_UMAP))
# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mUMAP1)), length(rownames(mUMAP2)), length(rownames(mUMAP12)))

# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_umap1 <- c(rownames(mUMAP1), rep("", max_length - length(rownames(mUMAP1))))
rownames_umap2 <- c(rownames(mUMAP2), rep("", max_length - length(rownames(mUMAP2))))
rownames_umap12 <- c(rownames(mUMAP12), rep("", max_length - length(rownames(mUMAP12))))

final_table <- data.frame(
  mUMAP1 = rownames_umap1,
  mUMAP2 = rownames_umap2,
  mUMAP1_and_mUMAP2 = rownames_umap12
)

final_table
plots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
  col_name <- colnames(covs2)[i]
  
  if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric"){
    next
  }
  if (class(covs2[[i]]) == "factor"){
    p <- covs2 %>%
      group_by(across(all_of(col_name)), sampleset_UMAP) %>%
      summarise(count = n(), .groups = 'drop') %>%
      filter(sampleset_UMAP == "mUMAP1" | sampleset_UMAP == "mUMAP2" | sampleset_UMAP == "mUMAP1 and mUMAP2") %>%
      mutate(total = sum(count), percentage = (count / total) * 100) %>%
      ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_UMAP")) +
      geom_bar(stat="identity", position=position_dodge()) +
      labs(x = names(covs2)[i], y = "Percentage", fill = "Outliers") +
      theme_minimal()
    
    plots[[colnames(covs2)[i]]] <- p
  }
}

grid.arrange(plots$study , plots$braaksc,plots$ceradsc, plots$cogdx, plots$apoe_genotype,  plots$neuroStatus, ncol=2)

colnames(umap.data) <- c("UMAP1.KEGG", "UMAP2.KEGG")

covs2 <- merge(covs2, umap.data, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULL
plots.umap.kegg <- list()

# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
  # Crea una gráfica para cada variable
  p <- ggplot(covs2, aes_string(x = "UMAP1.KEGG", y = "UMAP2.KEGG", color = names(covs2)[i])) +
    geom_point(show.legend = TRUE) +
    geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outliers.umap1) | mrna_id %in% rownames(outliers.umap2), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 30, # Reduce el número máximo de solapamientos
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = names(covs2)[i],
         x = "UMAP1.KEGG", y = "UMAP2.KEGG", color = names(covs2)[i]) +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm"),
      plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
    )

  plots.umap.kegg[[colnames(covs2)[i]]] <- p
}

grid.arrange(plots.umap.kegg$study, plots.umap.kegg$braaksc, ncol=1)

grid.arrange(plots.umap.kegg$ceradsc, plots.umap.kegg$cogdx, ncol=1)

grid.arrange(plots.umap.kegg$apoe_genotype, plots.umap.kegg$neuroStatus, ncol=1)

3 Geneset de AD de Andrews et al 2023 Lancet review

geneset.AD.Andrews <- read_delim("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/geneset AD Andrews 2023 Lancet.csv", delim = ";", escape_double = FALSE, trim_ws = TRUE)

geneset.AD.Andrews <- unique(geneset.AD.Andrews$Gene)
mat.expre.AD.Andrews <- data.frame(mat_exp[, colnames(mat_exp) %in% geneset.AD.Andrews])
venn.plot <- venn.diagram(
  x = list(GenesMatriz = colnames(mat_exp), GenesetAlzheimer = geneset.AD.Andrews),
  category.names = c("Matrix Genes", "Geneset Alzheimer"),
  filename = NULL,
  output = FALSE,  # Asegura que no se exporte a un archivo
  fill = c("#440154ff", '#fde725ff'),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = 25, #posicion de las categoricas
  cat.dist = 0.1, #distancia de las categoricas
  rotation.degree = 0, 
  margin = 0.1, # hacerla más pequeña
  lwd=0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = "Alzheimer genes in exp matrix Andrews",
  main.fontface= "bold", 
  main.cex = 2,
  main.pos = c(0.5, 1)
)

grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)

3.1 PCA

pca.AD.Andrews <- prcomp(mat.expre.AD.Andrews)

3.1.0.1 Seleccionar muestras con el doble de la desviación típica

Seleccionamos muestras que esten 2 veces la desviacion tipica para la PC1 y PC2

outlierspc1.Andrews <- as.data.frame(pca.AD.Andrews$x[abs(pca.AD.Andrews$x[,1]) > 2*(pca.AD.Andrews$sdev[1]),1])

outlierspc2.Andrews <- as.data.frame(pca.AD.Andrews$x[abs(pca.AD.Andrews$x[,2]) > 2*(pca.AD.Andrews$sdev[2]),2])

Las ploteamos

Creamos los grupos de muestras seleccionados de los outliers de PC1 y/o PC2

mPC1.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc1.Andrews),]
mPC2.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc2.Andrews),]
mPC12.Andrews <- mat_exp[(rownames(mat_exp) %in% rownames(outlierspc2.Andrews)) & (rownames(mat_exp) %in% rownames(outlierspc1.Andrews)),]

not_in_PC12.Andrews <- mat_exp[!((rownames(mat_exp) %in% rownames(outlierspc2.Andrews)) | (rownames(mat_exp) %in% rownames(outlierspc1.Andrews))),]
rownames(covs2) <- covs2$mrna_id

for (i in rownames(covs2)){
  if (i %in% rownames(mPC1.Andrews) & !(i %in% rownames(mPC12.Andrews))){
    covs2[i, "sampleset_PCA_Andrews"] <- "mPC1"
  }
  if (i %in% rownames(mPC2.Andrews) & !(i %in% rownames(mPC12.Andrews))){
    covs2[i, "sampleset_PCA_Andrews"] <- "mPC2"
  }
  if (i %in% rownames(mPC12.Andrews)){
    covs2[i, "sampleset_PCA_Andrews"] <- "mPC1 and mPC2"
  }
  if (!(i %in% rownames(mPC1.Andrews)) & !(i %in% rownames(mPC2.Andrews))){
    covs2[i, "sampleset_PCA_Andrews"] <- "Not in both"
  }
}

covs2$sampleset_PCA_Andrews <- as.factor(covs2$sampleset_PCA_Andrews)
data.frame(table(covs2$sampleset_PCA_Andrews))
# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mPC1.Andrews)), length(rownames(mPC2.Andrews)), length(rownames(mPC12.Andrews)))

# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_mPC1.Andrews <- c(rownames(mPC1.Andrews), rep("", max_length - length(rownames(mPC1.Andrews))))
rownames_mPC2.Andrews <- c(rownames(mPC2.Andrews), rep("", max_length - length(rownames(mPC2.Andrews))))
rownames_mPC12.Andrews <- c(rownames(mPC12.Andrews), rep("", max_length - length(rownames(mPC12.Andrews))))

final_table <- data.frame(
  mPC1 = rownames_mPC1.Andrews,
  mPC2 = rownames_mPC2.Andrews,
  mPC1_and_PC2 = rownames_mPC12.Andrews
)

final_table

3.1.0.2 Covariables en outliers

Metemos una nueva variable en las covariables dependiendo de si son outliers de PC1 o del PC2 o de ninguno.

plots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
  col_name <- colnames(covs2)[i]
  
  if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric" ){
    next
  }
  if (class(covs2[[i]]) == "factor"){
    filt <- covs2 %>%
      group_by(across(all_of(col_name)), sampleset_PCA_Andrews) %>%
      summarise(count = n(), .groups = 'drop') %>%
      filter(sampleset_PCA_Andrews == "mPC1" | sampleset_PCA_Andrews == "mPC2" | sampleset_PCA_Andrews == "mPC1 and mPC2") %>%
      mutate(total = sum(count), percentage = (count / total) * 100) %>%
      group_by(across(all_of(col_name))) %>% 
      mutate(suma.total = sum(percentage)) %>%
      ungroup()
    
    p <- ggplot(filt, aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_PCA_Andrews")) +
        geom_bar(stat="identity", position=position_dodge()) +
        geom_text(aes(label=paste0(round(suma.total, 1), " %"), y = suma.total), label.size = 10, color = "black") +
        labs(x = names(covs2)[i], y = "%", fill = "Sample Set PCA") +
        theme_minimal() 
    
    if (length(unique(covs2[[i]])) >= 2) {
      p <- p + 
        geom_segment(aes(x = 1, xend = 1, y = 0, yend = suma.total[1]-1), color = "red", size = 3) 
        p <- p + geom_segment(aes(x = 2, xend = 2, y = 0, yend = suma.total[3]-1), color = "red", size = 3)
      if (length(unique(covs2[[i]])) >= 3){
        p <- p + geom_segment(aes(x = 3, xend = 3, y = 0, yend = suma.total[5]-1), color = "red", size = 3)
        if (length(unique(covs2[[i]])) >= 4) {
          p <- p + geom_segment(aes(x = 4, xend = 4, y = 0, yend = suma.total[7]-1), color = "red", size = 3)
          if (length(unique(covs2[[i]])) >= 5) {
            p <- p + geom_segment(aes(x = 5, xend = 5, y = 0, yend = suma.total[9]-1), color = "red", size = 3)
          }
        }
      }
    } 
  }
    
    plots[[colnames(covs2)[i]]] <- p
  
}

grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype,  plots$neuroStatus, ncol=2)

pca.data.Andrews <- data.frame(pca.AD.Andrews$x)[,1:2]
colnames(pca.data.Andrews) <- c("PCA1.Andrews", "PCA2.Andrews")

covs2 <- merge(covs2, pca.data.Andrews, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULL
plots.pca.Andrews <- list()

# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
  # Crea una gráfica para cada variable
  p <- ggplot(covs2, aes_string(x = "PCA1.Andrews", y = "PCA2.Andrews", color = names(covs2)[i])) +
    geom_point(show.legend = TRUE) +
    geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outlierspc1.Andrews) | mrna_id %in% rownames(outlierspc2.Andrews), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 10, # Reduce el número máximo de solapamientos
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = names(covs2)[i],
         x = "PCA1.Andrews", y = "PCA2.Andrews", color = names(covs2)[i]) +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm"),
      plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
    )

  plots.pca.Andrews[[colnames(covs2)[i]]] <- p
}

grid.arrange(plots.pca.Andrews$study, plots.pca.Andrews$braaksc, ncol=1)

grid.arrange(plots.pca.Andrews$ceradsc, plots.pca.Andrews$cogdx, ncol=1)

grid.arrange(plots.pca.Andrews$apoe_genotype, plots.pca.Andrews$neuroStatus, ncol=1)

3.2 tSNE

set.seed(1234)
tsne.Andrews <- Rtsne(mat.expre.AD.Andrews, dims = 2, theta = 0.0)

tsne.data.Andrews <- as.data.frame(tsne.Andrews$Y)
row.names(tsne.data.Andrews) <- row.names(mat.expre.AD.Andrews)
tsne.Andrews.covs <- merge(tsne.data.Andrews, covs, by = "row.names")
tsne.Andrews.covs$Row.names <- NULL
row.names(tsne.Andrews.covs) <- tsne.Andrews.covs$mrna_id

3.2.1 Seleccionamos outliers

rownames(tsne.data.Andrews) <- rownames(mat.expre.AD.Andrews)

# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.tsne1 <- mean(tsne.data.Andrews[,1])
sd.tsne1 <- sd(tsne.data.Andrews[,1])
mean.tsne2 <- mean(tsne.data.Andrews[,2])
sd.tsne2 <- sd(tsne.data.Andrews[,2])

# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.tsne1.Andrews <- tsne.data.Andrews[abs(tsne.data.Andrews[,1] - mean.tsne1) > 2 * sd.tsne1, ]
outliers.tsne2.Andrews <-tsne.data.Andrews[abs(tsne.data.Andrews[,2] - mean.tsne2) > 2 * sd.tsne2, ]

mtSNE1.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne1.Andrews),]
mtSNE2.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne2.Andrews),]
mtSNE12.Andrews <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.tsne1.Andrews)) & (rownames(mat_exp) %in% rownames(outliers.tsne2.Andrews)),]

not_in_tSNE12.Andrews <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.tsne1.Andrews)) | (rownames(mat_exp) %in% rownames(outliers.tsne2.Andrews))),]

3.2.2 Covariables en outliers

for (i in rownames(covs2)){
  if (i %in% rownames(mtSNE1.Andrews) & !(i %in% rownames(mtSNE12.Andrews))){
    covs2[i, "sampleset_tSNE_Andrews"] <- "mtSNE1"
  }
  if (i %in% rownames(mtSNE2.Andrews) & !(i %in% rownames(mtSNE12.Andrews))){
    covs2[i, "sampleset_tSNE_Andrews"] <- "mtSNE2"
  }
  if (i %in% rownames(mtSNE12.Andrews)){
    covs2[i, "sampleset_tSNE_Andrews"] <- "mtSNE1 and mtSNE2"
  }
  if (!(i %in% rownames(mtSNE1.Andrews)) & !(i %in% rownames(mtSNE2.Andrews))){
    covs2[i, "sampleset_tSNE_Andrews"] <- "Not in both"
  }
}

covs2$sampleset_tSNE_Andrews <- as.factor(covs2$sampleset_tSNE_Andrews)
data.frame(table(covs2$sampleset_tSNE_Andrews))
# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mtSNE1.Andrews)), length(rownames(mtSNE2.Andrews)), length(rownames(mtSNE12.Andrews)))

# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_tsne1.Andrews <- c(rownames(mtSNE1.Andrews), rep("", max_length - length(rownames(mtSNE1.Andrews))))
rownames_tsne2.Andrews <- c(rownames(mtSNE2.Andrews), rep("", max_length - length(rownames(mtSNE2.Andrews))))
rownames_tsne12.Andrews <- c(rownames(mtSNE12.Andrews), rep("", max_length - length(rownames(mtSNE12.Andrews))))

final_table <- data.frame(
  mtSNE1 = rownames_tsne1.Andrews,
  mtSNE2 = rownames_tsne2.Andrews,
  mtSNE1_and_mtSNE2 = rownames_tsne12.Andrews
)

final_table
plots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
  col_name <- colnames(covs2)[i]
  
  if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric" ){
    next
  }
  if (class(covs2[[i]]) == "factor"){
    p <- covs2 %>%
      group_by(across(all_of(col_name)), sampleset_tSNE_Andrews) %>%
      summarise(count = n(), .groups = 'drop') %>%
      filter(sampleset_tSNE_Andrews == "mtSNE1" | sampleset_tSNE_Andrews == "mtSNE2" | sampleset_tSNE_Andrews == "mtSNE1 and mtSNE2") %>%
      mutate(total = sum(count), percentage = (count / total) * 100) %>%
      ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_tSNE_Andrews")) +
      geom_bar(stat="identity", position=position_dodge()) +
      labs(x = names(covs2)[i], y = "Percentage", fill = "Outliers") +
      theme_minimal()
    
    plots[[colnames(covs2)[i]]] <- p
  }
}

grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype,  plots$neuroStatus, ncol=2)

colnames(tsne.data.Andrews) <- c("tSNE1.Andrews", "tSNE2.Andrews")

covs2 <- merge(covs2, tsne.data.Andrews, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULL
plots.tsne.Andrews <- list()

# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
  # Crea una gráfica para cada variable
  p <- ggplot(covs2, aes_string(x = "tSNE1.Andrews", y = "tSNE2.Andrews", color = names(covs2)[i])) +
    geom_point(show.legend = TRUE) +
    geom_text_repel(aes(label=ifelse((covs2$mrna_id %in% rownames(outliers.tsne1.Andrews) | covs2$mrna_id %in% rownames(outliers.tsne2.Andrews)), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 30, # Reduce el número máximo de solapamientos
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = names(covs2)[i],
         x = "tSNE1.Andrews", y = "tSNE2.Andrews", color = names(covs2)[i]) +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm"),
      plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
    )

  plots.tsne.Andrews[[colnames(covs2)[i]]] <- p
}

grid.arrange(plots.tsne.Andrews$study, plots.tsne.Andrews$braaksc, ncol=1)

grid.arrange(plots.tsne.Andrews$ceradsc, plots.tsne.Andrews$cogdx, ncol=1)

grid.arrange(plots.tsne.Andrews$apoe_genotype, plots.tsne.Andrews$neuroStatus, ncol=1)

3.3 UMAP

local.config <- umap.defaults
# local.config$n_neighbors <- 4
# local.config$n_components <- 2
# local.config$n_epochs <- 100
# local.config$metric<- "euclidean"
set.seed(1234)
umap.ad.Andrews <- umap(mat.expre.AD.Andrews,random_stage=1234, local.config)

umap.data.Andrews <- as.data.frame(umap.ad.Andrews$layout)
row.names(umap.data.Andrews) <- row.names(mat.expre.AD.Andrews)
umap.data.covs.Andrews <- merge(umap.data.Andrews, covs2, by = "row.names")
umap.data.covs.Andrews$Row.names <- NULL
row.names(umap.data.covs.Andrews) <- umap.data.covs.Andrews$mrna_id

3.3.1 Seleccionamos outliers

rownames(umap.data.Andrews) <- rownames(mat.expre.AD.Andrews)

# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.umap1 <- mean(umap.data.Andrews[,1])
sd.umap1 <- sd(umap.data.Andrews[,1])
mean.umap2 <- mean(umap.data.Andrews[,2])
sd.umap2 <- sd(umap.data.Andrews[,2])

# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.umap1.Andrews <- umap.data.Andrews[abs(umap.data.Andrews[,1] - mean.umap1) > 2 * sd.umap1, ]
outliers.umap2.Andrews <-umap.data.Andrews[abs(umap.data.Andrews[,2] - mean.umap2) > 2 * sd.umap2, ]

mUMAP1.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap1.Andrews),]
mUMAP2.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap2.Andrews),]
mUMAP12.Andrews <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.umap1.Andrews)) & (rownames(mat_exp) %in% rownames(outliers.umap2.Andrews)),]

not_in_UMAP12.Andrews <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.umap1.Andrews)) | (rownames(mat_exp) %in% rownames(outliers.umap2.Andrews))),]

3.3.2 Covariables en outliers

for (i in rownames(covs2)){
  if (i %in% rownames(mUMAP1.Andrews) & !(i %in% rownames(mUMAP12.Andrews))){
    covs2[i, "sampleset_UMAP_Andrews"] <- "mUMAP1"
  }
  if (i %in% rownames(mUMAP2.Andrews) & !(i %in% rownames(mUMAP12.Andrews))){
    covs2[i, "sampleset_UMAP_Andrews"] <- "mUMAP2"
  }
  if (i %in% rownames(mUMAP12.Andrews)){
    covs2[i, "sampleset_UMAP_Andrews"] <- "mUMAP1 and mUMAP2"
  }
  if (!(i %in% rownames(mUMAP1.Andrews)) & !(i %in% rownames(mUMAP2.Andrews))){
    covs2[i, "sampleset_UMAP_Andrews"] <- "Not in both"
  }
}

covs2$sampleset_UMAP_Andrews <- as.factor(covs2$sampleset_UMAP_Andrews)
data.frame(table(covs2$sampleset_UMAP_Andrews))
# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mUMAP1.Andrews)), length(rownames(mUMAP2.Andrews)), length(rownames(mUMAP12.Andrews)))

# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_umap1.Andrews <- c(rownames(mUMAP1.Andrews), rep("", max_length - length(rownames(mUMAP1.Andrews))))
rownames_umap2.Andrews <- c(rownames(mUMAP2.Andrews), rep("", max_length - length(rownames(mUMAP2.Andrews))))
rownames_umap12.Andrews <- c(rownames(mUMAP12.Andrews), rep("", max_length - length(rownames(mUMAP12.Andrews))))

final_table <- data.frame(
  mUMAP1 = rownames_umap1.Andrews,
  mUMAP2 = rownames_umap2.Andrews,
  mUMAP1_and_mUMAP2 = rownames_umap12.Andrews
)

final_table
plots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
  col_name <- colnames(covs2)[i]
  
  if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric"){
    next
  }
  if (class(covs2[[i]]) == "factor"){
    p <- covs2 %>%
      group_by(across(all_of(col_name)), sampleset_UMAP_Andrews) %>%
      summarise(count = n(), .groups = 'drop') %>%
      filter(sampleset_UMAP_Andrews == "mUMAP1" | sampleset_UMAP_Andrews == "mUMAP2" | sampleset_UMAP_Andrews == "mUMAP1 and mUMAP2") %>%
      mutate(total = sum(count), percentage = (count / total) * 100) %>%
      ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_UMAP_Andrews")) +
      geom_bar(stat="identity", position=position_dodge()) +
      labs(x = NULL, y = "%", fill = col_name) +
      theme_minimal()
    
    plots[[colnames(covs2)[i]]] <- p
  }
}

grid.arrange(plots$study , plots$braaksc,plots$ceradsc, plots$cogdx, plots$apoe_genotype,  plots$neuroStatus, ncol=2)

colnames(umap.data.Andrews) <- c("UMAP1.Andrews", "UMAP2.Andrews")

covs2 <- merge(covs2, umap.data.Andrews, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULL
plots.umap.Andrews <- list()

# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
  # Crea una gráfica para cada variable
  p <- ggplot(covs2, aes_string(x = "UMAP1.Andrews", y = "UMAP2.Andrews", color = names(covs2)[i])) +
    geom_point(show.legend = TRUE) +
    geom_text_repel(aes(label=ifelse((covs2$mrna_id %in% rownames(outliers.umap1.Andrews) | covs2$mrna_id %in% rownames(outliers.umap2.Andrews)), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 30, 
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = names(covs2)[i],
         x = "UMAP1.Andrews", y = "UMAP2.Andrews", color = names(covs2)[i]) +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm"),
      plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
    )

  plots.umap.Andrews[[colnames(covs2)[i]]] <- p
}

grid.arrange(plots.umap.Andrews$study, plots.umap.Andrews$braaksc, ncol=1)

grid.arrange(plots.umap.Andrews$ceradsc, plots.umap.Andrews$cogdx, ncol=1)

grid.arrange(plots.umap.Andrews$apoe_genotype, plots.umap.Andrews$neuroStatus, ncol=1)

4 Comparaciones

4.1 Entre aproximaciones

grid.arrange(plots.pca.kegg$neuroStatus, plots.pca.Andrews$neuroStatus,plots.tsne.kegg$neuroStatus, plots.tsne.Andrews$neuroStatus, plots.umap.kegg$neuroStatus, plots.umap.Andrews$neuroStatus,ncol = 2)

4.2 Muestras entre técnicas

all.outliers <- list(PCA.KEGG = c(rownames(outlierspc1_scaled), rownames(outlierspc2_scaled)),
                     tSNE.KEGG = c(rownames(outliers.tsne1), rownames(outliers.tsne2)),
                     UMAP.KEGG = c(rownames(outliers.umap1), rownames(outliers.umap2)),
                     PCA.Andrews = c(rownames(outlierspc1.Andrews), rownames(outlierspc2.Andrews)),
                     tSNE.Andrews = c(rownames(outliers.tsne1.Andrews), rownames(outliers.tsne2.Andrews)),
                     UMAP.Andrews = c(rownames(outliers.umap1.Andrews), rownames(outliers.umap2.Andrews))
)

max_length <- max(sapply(all.outliers, length))

# Función para normalizar las longitudes de las listas
normalize_length <- function(x, max_length) {
  length(x) <- max_length  # Esto rellenará con NA si la lista es más corta que max_length
  x
}

# Aplicar la normalización a cada lista en 'all.outliers'
all.outliers <- lapply(all.outliers, normalize_length, max_length = max_length)

# Convertir la lista normalizada en un data.frame
all.outliers <- data.frame(all.outliers)

all.outliers
# Convertimos el data frame a un vector para hacer la tabla
all_values.outliers <- unlist(all.outliers, use.names = FALSE)

# Usamos table() para contar las frecuencias de cada muestra
table_values <- table(all_values.outliers)

# Filtramos para mostrar solo las muestras que se repiten
repeated_samples <- names(table_values[table_values > 1])

# Unlist with names to track the original column of each value
all_values_named <- unlist(all.outliers, use.names = TRUE)

# Extract unique non-NA samples
unique_samples <- unique(all_values_named[!is.na(all_values_named)])

# Create an empty matrix to fill with presence (1) or absence (0)
incidence_matrix <- matrix(0, nrow = length(unique_samples), ncol = length(all.outliers))
rownames(incidence_matrix) <- unique_samples
colnames(incidence_matrix) <- names(all.outliers)

# Populate the incidence matrix
for (col in names(all.outliers)) {
  present_samples <- all.outliers[[col]][!is.na(all.outliers[[col]])]  # Extract non-NA samples from column
  incidence_matrix[rownames(incidence_matrix) %in% present_samples, col] <- 1
}

# Melt the incidence matrix for ggplot
melted_incidence_matrix <- melt(incidence_matrix, varnames = c("sample", "column"), value.name = "presence")

colnames(melted_incidence_matrix)[3] <- "presence"

# Convertir la matriz de incidencia en un data frame
incidence_df <- as.data.frame(incidence_matrix)
# Crear un data frame para contar y listar las columnas de aparición
summary_df <- data.frame(
  sample = rownames(incidence_df),
  count = rowSums(incidence_df),
  columns = apply(incidence_df, 1, function(row) paste(names(df)[as.logical(row)], collapse = ", "))
)

# Ordenar el summary_df para mejorar la visualización
summary_df <- summary_df %>% 
  arrange(desc(count)) %>% 
  mutate(sample = factor(sample, levels = sample))

# Crear el gráfico de barras
ggplot(summary_df, aes(x = sample, y = count, fill = count)) +  # Mostrar solo las primeras 20 muestras para claridad
  geom_bar(stat = "identity", color = "black", show.legend = FALSE) +
  coord_flip() +  # Invertir el gráfico para mejor visualización de etiquetas
  labs(x = "Sample", y = "Count of Appearances", title = "Sample Appearances Across dimensions") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        axis.title.x = element_text(size = 12, face = "bold"),
        axis.title.y = element_text(size = 12, face = "bold"),
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5))

# Calcular el número total de presencias para cada muestra
total_presences <- rowSums(incidence_matrix)
melted_incidence_matrix$total_presence <- total_presences[as.character(melted_incidence_matrix$sample)]

melted_incidence_matrix$column <- gsub(pattern = "outliers.", replacement = "", x = melted_incidence_matrix$column)

melted_incidence_matrix$sample <- gsub(pattern = "_.*", replacement = "", x = melted_incidence_matrix$sample)

filtered_data <- melted_incidence_matrix %>% 
  filter(presence == 1) %>%
  arrange(desc(total_presence)) %>%
  mutate(sample = factor(sample, levels = unique(sample)))

ggplot(filtered_data, aes(x = column, y = sample, size = total_presence)) +
  geom_point(color = "steelblue") +
  scale_size(range = c(2, 6), guide = "none") +  # Adjustable size scale with no size legend
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "right") +
  labs(y = "Sample", x= "Dimension", title = "Presence of Samples Across Dimensions", size = "Total Presence")

4.3 Diagrama de Venn

lista.outliers <- lapply(all.outliers, function(x) unique(na.omit(x)))
venn.plot <- venn.diagram(
  x = lista.outliers[1:3],
  category.names = c("PCA", "tSNE", "UMAP"),
  filename = NULL,
  output = FALSE,  
  fill = c("#440154ff", '#21908dff', '#fde725ff'),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = c(0, 0, 0), #posicion de las categoricas
  cat.dist = 0.1, #distancia de las categoricas
  rotation.degree = 0, 
  margin = 0.1, # hacerla más pequeña
  lwd=0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = paste0("Diagrama Venn KEGG geneset"),
  main.fontface= "bold", 
  main.cex = 2,
  main.pos = c(0.5, 0.9)
)

interseccion_ABC <- Reduce(intersect, lista.outliers[1:3])
grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_ABC), collapse = "\n"), 
          x = 0.5, 
          y = 0.3, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

interseccion_AB <- Reduce(intersect, lista.outliers[1:2])
grid.text(label = paste(gsub("_.*","",interseccion_AB[!(interseccion_AB %in% interseccion_ABC)]), collapse = "\n"), 
          x = 0.68, 
          y = 0.33, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

interseccion_AC <- Reduce(intersect, lista.outliers[c(1,3)])
grid.text(label = paste(gsub("_.*","",interseccion_AC[!(interseccion_AC %in% interseccion_ABC)]), collapse = "\n"), 
          x = 0.365, 
          y = 0.25, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

venn.plot <- venn.diagram(
  x = lista.outliers[4:6],
  category.names = c("PCA", "tSNE", "UMAP"),
  filename = NULL,
  output = FALSE,  
  fill = c("#440154ff", '#21908dff', '#fde725ff'),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = c(0, 0, 0), #posicion de las categoricas
  cat.dist = c(0.1, 0.1, -0.05), #distancia de las categoricas
  rotation.degree = 0, 
  margin = 0.1, # hacerla más pequeña
  lwd=0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = paste0("Diagrama Venn Andrews geneset"),
  main.fontface= "bold", 
  main.cex = 2,
  main.pos = c(0.5, 1)
)

interseccion_ABC <- Reduce(intersect, lista.outliers[4:6])
grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_ABC), collapse = "\n"), 
          x = 0.58, 
          y = 0.35, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

interseccion_AC <- Reduce(intersect, lista.outliers[c(4,6)])
grid.text(label = paste(gsub("_.*","",interseccion_AC[!(interseccion_AC %in% interseccion_ABC)]), collapse = "\n"), 
          x = 0.42, 
          y = 0.35, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

interseccion_BC <- Reduce(intersect, lista.outliers[5:6])
grid.text(label = paste(gsub("_.*","",interseccion_BC[!(interseccion_BC %in% interseccion_ABC)]), collapse = "\n"), 
          x = 0.68, 
          y = 0.4, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

interseccion_AB <- Reduce(intersect, lista.outliers[4:5])
grid.text(label = paste(gsub("_.*","",interseccion_AB[!(interseccion_AB %in% interseccion_ABC)]), collapse = "\n"), 
          x = 0.56, 
          y = 0.7, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

venn.plot <- venn.diagram(
  x = lista.outliers[c(1,4)],
  category.names = c("KEGG", "Andrews"),
  filename = NULL,
  output = FALSE,  
  fill = c("#440154ff", '#fde725ff'),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = c(0, 0), #posicion de las categoricas
  cat.dist = c(0.1, 0.1), #distancia de las categoricas
  rotation.degree = 0, 
  margin = 0.1, # hacerla más pequeña
  lwd=0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = paste0("Diagrama Venn PCA geneset"),
  main.fontface= "bold", 
  main.cex = 2,
  main.pos = c(0.5, 0.9)
)

interseccion_AB <- Reduce(intersect, lista.outliers[c(1,4)])
grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_AB), collapse = "\n"), 
          x = 0.58, 
          y = 0.4, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

venn.plot <- venn.diagram(
  x = lista.outliers[c(2,5)],
  category.names = c("KEGG", "Andrews"),
  filename = NULL,
  output = FALSE,  
  fill = c("#440154ff", '#fde725ff'),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = c(0, 0), #posicion de las categoricas
  cat.dist = c(0.1, 0.1), #distancia de las categoricas
  rotation.degree = 0, 
  margin = 0.1, # hacerla más pequeña
  lwd=0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = paste0("Diagrama Venn tSNE geneset"),
  main.fontface= "bold", 
  main.cex = 2,
  main.pos = c(0.5, 0.9)
)

interseccion_AB <- Reduce(intersect, lista.outliers[c(2,5)])
grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_AB), collapse = "\n"), 
          x = 0.63, 
          y = 0.4, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

venn.plot <- venn.diagram(
  x = lista.outliers[c(3,6)],
  category.names = c("KEGG", "Andrews"),
  filename = NULL,
  output = FALSE,  
  fill = c("#440154ff", '#fde725ff'),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = c(0, 0), #posicion de las categoricas
  cat.dist = c(0.1, 0.1), #distancia de las categoricas
  rotation.degree = 0, 
  margin = 0.1, # hacerla más pequeña
  lwd=0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = paste0("Diagrama Venn UMAP geneset"),
  main.fontface= "bold", 
  main.cex = 2,
  main.pos = c(0.5, 0.9)
)

interseccion_AB <- Reduce(intersect, lista.outliers[c(3,6)])
grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_AB), collapse = "\n"), 
          x = 0.52, 
          y = 0.3, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))